# Load the 2002 and 2022 daily PM2.5 exports from the user's Desktop folder.
data2002 <- read.csv("~/Desktop/2002.csv")
data2022 <- read.csv("~/Desktop/2022.csv")
# Yiwei Gu HW1 092625
#1
read data 2002 and 2022:
check dimensions, headers and tails:
dim(data2002)[1] 15976 22
dim(data2022)[1] 59918 22
head(data2002) Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units
1 01/05/2002 AQS 60010007 1 25.1 ug/m3 LC
2 01/06/2002 AQS 60010007 1 31.6 ug/m3 LC
3 01/08/2002 AQS 60010007 1 21.4 ug/m3 LC
4 01/11/2002 AQS 60010007 1 25.9 ug/m3 LC
5 01/14/2002 AQS 60010007 1 34.5 ug/m3 LC
6 01/17/2002 AQS 60010007 1 41.0 ug/m3 LC
Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete
1 81 Livermore 1 100
2 93 Livermore 1 100
3 74 Livermore 1 100
4 82 Livermore 1 100
5 98 Livermore 1 100
6 115 Livermore 1 100
AQS.Parameter.Code AQS.Parameter.Description Method.Code
1 88101 PM2.5 - Local Conditions 120
2 88101 PM2.5 - Local Conditions 120
3 88101 PM2.5 - Local Conditions 120
4 88101 PM2.5 - Local Conditions 120
5 88101 PM2.5 - Local Conditions 120
6 88101 PM2.5 - Local Conditions 120
Method.Description CBSA.Code
1 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS 41860
2 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS 41860
3 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS 41860
4 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS 41860
5 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS 41860
6 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS 41860
CBSA.Name State.FIPS.Code State County.FIPS.Code
1 San Francisco-Oakland-Hayward, CA 6 California 1
2 San Francisco-Oakland-Hayward, CA 6 California 1
3 San Francisco-Oakland-Hayward, CA 6 California 1
4 San Francisco-Oakland-Hayward, CA 6 California 1
5 San Francisco-Oakland-Hayward, CA 6 California 1
6 San Francisco-Oakland-Hayward, CA 6 California 1
County Site.Latitude Site.Longitude
1 Alameda 37.68753 -121.7842
2 Alameda 37.68753 -121.7842
3 Alameda 37.68753 -121.7842
4 Alameda 37.68753 -121.7842
5 Alameda 37.68753 -121.7842
6 Alameda 37.68753 -121.7842
tail(data2002) Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units
15971 12/10/2002 AQS 61131003 1 15 ug/m3 LC
15972 12/13/2002 AQS 61131003 1 15 ug/m3 LC
15973 12/22/2002 AQS 61131003 1 1 ug/m3 LC
15974 12/25/2002 AQS 61131003 1 23 ug/m3 LC
15975 12/28/2002 AQS 61131003 1 5 ug/m3 LC
15976 12/31/2002 AQS 61131003 1 6 ug/m3 LC
Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete
15971 62 Woodland-Gibson Road 1 100
15972 62 Woodland-Gibson Road 1 100
15973 6 Woodland-Gibson Road 1 100
15974 77 Woodland-Gibson Road 1 100
15975 28 Woodland-Gibson Road 1 100
15976 33 Woodland-Gibson Road 1 100
AQS.Parameter.Code AQS.Parameter.Description Method.Code
15971 88101 PM2.5 - Local Conditions 117
15972 88101 PM2.5 - Local Conditions 117
15973 88101 PM2.5 - Local Conditions 117
15974 88101 PM2.5 - Local Conditions 117
15975 88101 PM2.5 - Local Conditions 117
15976 88101 PM2.5 - Local Conditions 117
Method.Description CBSA.Code
15971 R & P Model 2000 PM2.5 Sampler w/WINS 40900
15972 R & P Model 2000 PM2.5 Sampler w/WINS 40900
15973 R & P Model 2000 PM2.5 Sampler w/WINS 40900
15974 R & P Model 2000 PM2.5 Sampler w/WINS 40900
15975 R & P Model 2000 PM2.5 Sampler w/WINS 40900
15976 R & P Model 2000 PM2.5 Sampler w/WINS 40900
CBSA.Name State.FIPS.Code State
15971 Sacramento--Roseville--Arden-Arcade, CA 6 California
15972 Sacramento--Roseville--Arden-Arcade, CA 6 California
15973 Sacramento--Roseville--Arden-Arcade, CA 6 California
15974 Sacramento--Roseville--Arden-Arcade, CA 6 California
15975 Sacramento--Roseville--Arden-Arcade, CA 6 California
15976 Sacramento--Roseville--Arden-Arcade, CA 6 California
County.FIPS.Code County Site.Latitude Site.Longitude
15971 113 Yolo 38.66121 -121.7327
15972 113 Yolo 38.66121 -121.7327
15973 113 Yolo 38.66121 -121.7327
15974 113 Yolo 38.66121 -121.7327
15975 113 Yolo 38.66121 -121.7327
15976 113 Yolo 38.66121 -121.7327
head(data2022) Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units
1 01/01/2022 AQS 60010007 3 12.7 ug/m3 LC
2 01/02/2022 AQS 60010007 3 13.9 ug/m3 LC
3 01/03/2022 AQS 60010007 3 7.1 ug/m3 LC
4 01/04/2022 AQS 60010007 3 3.7 ug/m3 LC
5 01/05/2022 AQS 60010007 3 4.2 ug/m3 LC
6 01/06/2022 AQS 60010007 3 3.8 ug/m3 LC
Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete
1 58 Livermore 1 100
2 60 Livermore 1 100
3 39 Livermore 1 100
4 21 Livermore 1 100
5 23 Livermore 1 100
6 21 Livermore 1 100
AQS.Parameter.Code AQS.Parameter.Description Method.Code
1 88101 PM2.5 - Local Conditions 170
2 88101 PM2.5 - Local Conditions 170
3 88101 PM2.5 - Local Conditions 170
4 88101 PM2.5 - Local Conditions 170
5 88101 PM2.5 - Local Conditions 170
6 88101 PM2.5 - Local Conditions 170
Method.Description CBSA.Code
1 Met One BAM-1020 Mass Monitor w/VSCC 41860
2 Met One BAM-1020 Mass Monitor w/VSCC 41860
3 Met One BAM-1020 Mass Monitor w/VSCC 41860
4 Met One BAM-1020 Mass Monitor w/VSCC 41860
5 Met One BAM-1020 Mass Monitor w/VSCC 41860
6 Met One BAM-1020 Mass Monitor w/VSCC 41860
CBSA.Name State.FIPS.Code State County.FIPS.Code
1 San Francisco-Oakland-Hayward, CA 6 California 1
2 San Francisco-Oakland-Hayward, CA 6 California 1
3 San Francisco-Oakland-Hayward, CA 6 California 1
4 San Francisco-Oakland-Hayward, CA 6 California 1
5 San Francisco-Oakland-Hayward, CA 6 California 1
6 San Francisco-Oakland-Hayward, CA 6 California 1
County Site.Latitude Site.Longitude
1 Alameda 37.68753 -121.7842
2 Alameda 37.68753 -121.7842
3 Alameda 37.68753 -121.7842
4 Alameda 37.68753 -121.7842
5 Alameda 37.68753 -121.7842
6 Alameda 37.68753 -121.7842
tail(data2022) Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units
59913 12/01/2022 AQS 61131003 1 3.4 ug/m3 LC
59914 12/07/2022 AQS 61131003 1 3.8 ug/m3 LC
59915 12/13/2022 AQS 61131003 1 6.0 ug/m3 LC
59916 12/19/2022 AQS 61131003 1 34.8 ug/m3 LC
59917 12/25/2022 AQS 61131003 1 23.2 ug/m3 LC
59918 12/31/2022 AQS 61131003 1 1.0 ug/m3 LC
Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete
59913 19 Woodland-Gibson Road 1 100
59914 21 Woodland-Gibson Road 1 100
59915 33 Woodland-Gibson Road 1 100
59916 99 Woodland-Gibson Road 1 100
59917 77 Woodland-Gibson Road 1 100
59918 6 Woodland-Gibson Road 1 100
AQS.Parameter.Code AQS.Parameter.Description Method.Code
59913 88101 PM2.5 - Local Conditions 145
59914 88101 PM2.5 - Local Conditions 145
59915 88101 PM2.5 - Local Conditions 145
59916 88101 PM2.5 - Local Conditions 145
59917 88101 PM2.5 - Local Conditions 145
59918 88101 PM2.5 - Local Conditions 145
Method.Description CBSA.Code
59913 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC 40900
59914 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC 40900
59915 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC 40900
59916 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC 40900
59917 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC 40900
59918 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC 40900
CBSA.Name State.FIPS.Code State
59913 Sacramento--Roseville--Arden-Arcade, CA 6 California
59914 Sacramento--Roseville--Arden-Arcade, CA 6 California
59915 Sacramento--Roseville--Arden-Arcade, CA 6 California
59916 Sacramento--Roseville--Arden-Arcade, CA 6 California
59917 Sacramento--Roseville--Arden-Arcade, CA 6 California
59918 Sacramento--Roseville--Arden-Arcade, CA 6 California
County.FIPS.Code County Site.Latitude Site.Longitude
59913 113 Yolo 38.66121 -121.7327
59914 113 Yolo 38.66121 -121.7327
59915 113 Yolo 38.66121 -121.7327
59916 113 Yolo 38.66121 -121.7327
59917 113 Yolo 38.66121 -121.7327
59918 113 Yolo 38.66121 -121.7327
check variable names and types
str(data2002)'data.frame': 15976 obs. of 22 variables:
$ Date : chr "01/05/2002" "01/06/2002" "01/08/2002" "01/11/2002" ...
$ Source : chr "AQS" "AQS" "AQS" "AQS" ...
$ Site.ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
$ POC : int 1 1 1 1 1 1 1 1 1 1 ...
$ Daily.Mean.PM2.5.Concentration: num 25.1 31.6 21.4 25.9 34.5 41 29.3 15 18.8 37.9 ...
$ Units : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
$ Daily.AQI.Value : int 81 93 74 82 98 115 89 62 69 107 ...
$ Local.Site.Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
$ Daily.Obs.Count : int 1 1 1 1 1 1 1 1 1 1 ...
$ Percent.Complete : num 100 100 100 100 100 100 100 100 100 100 ...
$ AQS.Parameter.Code : int 88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
$ AQS.Parameter.Description : chr "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
$ Method.Code : int 120 120 120 120 120 120 120 120 120 120 ...
$ Method.Description : chr "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" ...
$ CBSA.Code : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
$ CBSA.Name : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
$ State.FIPS.Code : int 6 6 6 6 6 6 6 6 6 6 ...
$ State : chr "California" "California" "California" "California" ...
$ County.FIPS.Code : int 1 1 1 1 1 1 1 1 1 1 ...
$ County : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
$ Site.Latitude : num 37.7 37.7 37.7 37.7 37.7 ...
$ Site.Longitude : num -122 -122 -122 -122 -122 ...
str(data2022)'data.frame': 59918 obs. of 22 variables:
$ Date : chr "01/01/2022" "01/02/2022" "01/03/2022" "01/04/2022" ...
$ Source : chr "AQS" "AQS" "AQS" "AQS" ...
$ Site.ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
$ POC : int 3 3 3 3 3 3 3 3 3 3 ...
$ Daily.Mean.PM2.5.Concentration: num 12.7 13.9 7.1 3.7 4.2 3.8 2.3 6.9 13.6 11.2 ...
$ Units : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
$ Daily.AQI.Value : int 58 60 39 21 23 21 13 38 59 55 ...
$ Local.Site.Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
$ Daily.Obs.Count : int 1 1 1 1 1 1 1 1 1 1 ...
$ Percent.Complete : num 100 100 100 100 100 100 100 100 100 100 ...
$ AQS.Parameter.Code : int 88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
$ AQS.Parameter.Description : chr "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
$ Method.Code : int 170 170 170 170 170 170 170 170 170 170 ...
$ Method.Description : chr "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" ...
$ CBSA.Code : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
$ CBSA.Name : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
$ State.FIPS.Code : int 6 6 6 6 6 6 6 6 6 6 ...
$ State : chr "California" "California" "California" "California" ...
$ County.FIPS.Code : int 1 1 1 1 1 1 1 1 1 1 ...
$ County : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
$ Site.Latitude : num 37.7 37.7 37.7 37.7 37.7 ...
$ Site.Longitude : num -122 -122 -122 -122 -122 ...
- closer look at the key variables
table(data2002$Daily.Mean.PM2.5.Concentration)
0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1 1.1 1.2
3 7 18 23 19 28 31 21 30 35 74 32 28
1.3 1.4 1.5 1.6 1.7 1.8 1.9 2 2.1 2.2 2.3 2.4 2.5
25 24 29 25 43 29 38 97 26 45 36 39 32
2.6 2.7 2.8 2.9 3 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8
37 38 42 40 167 43 44 46 29 48 46 51 49
3.9 4 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5 5.1
52 227 41 61 60 54 62 66 49 49 57 267 55
5.2 5.3 5.4 5.5 5.6 5.7 5.8 5.9 6 6.1 6.2 6.3 6.4
63 58 66 62 55 53 63 55 332 44 64 54 57
6.5 6.6 6.7 6.8 6.9 7 7.1 7.2 7.3 7.4 7.5 7.6 7.7
57 45 61 63 51 309 65 53 70 48 63 63 71
7.8 7.9 8 8.1 8.2 8.3 8.4 8.5 8.6 8.7 8.8 8.9 9
58 55 302 51 63 49 43 65 55 56 61 73 290
9.1 9.2 9.3 9.4 9.5 9.6 9.7 9.8 9.9 10 10.1 10.2 10.3
66 55 51 54 78 57 49 72 49 249 57 68 54
10.4 10.5 10.6 10.7 10.8 10.9 11 11.1 11.2 11.3 11.4 11.5 11.6
62 56 52 50 56 51 217 62 47 51 49 71 50
11.7 11.8 11.9 12 12.1 12.2 12.3 12.4 12.5 12.6 12.7 12.8 12.9
55 58 39 209 46 54 53 40 54 42 45 62 50
13 13.1 13.2 13.3 13.4 13.5 13.6 13.7 13.8 13.9 14 14.1 14.2
190 50 47 55 45 61 53 49 45 40 177 47 43
14.3 14.4 14.5 14.6 14.7 14.8 14.9 15 15.1 15.2 15.3 15.4 15.5
38 42 49 38 57 46 48 138 40 43 50 38 45
15.6 15.7 15.8 15.9 16 16.1 16.2 16.3 16.4 16.5 16.6 16.7 16.8
47 46 39 38 129 35 34 37 36 36 32 35 29
16.9 17 17.1 17.2 17.3 17.4 17.5 17.6 17.7 17.8 17.9 18 18.1
34 105 28 33 23 46 36 31 29 31 26 79 27
18.2 18.3 18.4 18.5 18.6 18.7 18.8 18.9 19 19.1 19.2 19.3 19.4
28 37 21 32 25 48 35 23 88 36 34 29 28
19.5 19.6 19.7 19.8 19.9 20 20.1 20.2 20.3 20.4 20.5 20.6 20.7
31 21 27 31 20 85 24 21 20 23 26 22 18
20.8 20.9 21 21.1 21.2 21.3 21.4 21.5 21.6 21.7 21.8 21.9 22
33 24 70 17 26 26 24 31 13 23 20 24 62
22.1 22.2 22.3 22.4 22.5 22.6 22.7 22.8 22.9 23 23.1 23.2 23.3
15 21 24 21 27 31 23 27 12 68 20 25 20
23.4 23.5 23.6 23.7 23.8 23.9 24 24.1 24.2 24.3 24.4 24.5 24.6
14 18 29 19 20 13 55 17 20 11 13 24 23
24.7 24.8 24.9 25 25.1 25.2 25.3 25.4 25.5 25.6 25.7 25.8 25.9
15 17 13 40 14 12 19 12 25 9 22 15 23
26 26.1 26.2 26.3 26.4 26.5 26.6 26.7 26.8 26.9 27 27.1 27.2
31 17 24 21 15 29 11 20 25 12 48 16 12
27.3 27.4 27.5 27.6 27.7 27.8 27.9 28 28.1 28.2 28.3 28.4 28.5
24 3 11 15 11 13 8 42 14 11 11 8 16
28.6 28.7 28.8 28.9 29 29.1 29.2 29.3 29.4 29.5 29.6 29.7 29.8
7 12 10 11 21 13 12 12 10 13 11 9 18
29.9 30 30.1 30.2 30.3 30.4 30.5 30.6 30.7 30.8 30.9 31 31.1
12 23 11 14 7 12 11 9 7 11 6 24 9
31.2 31.3 31.4 31.5 31.6 31.7 31.8 31.9 32 32.1 32.2 32.3 32.4
3 13 7 15 8 12 8 12 31 10 2 10 15
32.5 32.6 32.7 32.8 32.9 33 33.1 33.2 33.3 33.4 33.5 33.6 33.7
12 12 8 10 5 30 6 9 9 8 5 9 3
33.8 33.9 34 34.1 34.2 34.3 34.4 34.5 34.6 34.7 34.8 34.9 35
12 4 31 10 9 8 7 11 6 11 5 5 20
35.1 35.2 35.3 35.4 35.5 35.6 35.7 35.8 35.9 36 36.1 36.2 36.3
6 13 1 7 6 13 6 15 8 17 10 10 9
36.4 36.5 36.6 36.7 36.8 36.9 37 37.1 37.2 37.3 37.4 37.5 37.6
3 8 6 7 7 8 15 3 12 7 6 2 4
37.7 37.8 37.9 38 38.1 38.2 38.3 38.4 38.5 38.6 38.7 38.8 38.9
6 3 4 25 9 9 2 4 4 4 6 9 3
39 39.1 39.2 39.3 39.4 39.5 39.6 39.7 39.8 39.9 40 40.1 40.2
13 6 6 7 4 9 9 5 5 6 16 5 5
40.3 40.4 40.5 40.6 40.7 40.8 40.9 41 41.1 41.2 41.3 41.4 41.5
5 2 9 7 6 5 3 16 4 6 6 5 6
41.6 41.7 41.8 41.9 42 42.1 42.2 42.3 42.4 42.5 42.6 42.7 42.8
5 8 4 4 22 2 6 7 3 4 1 7 9
42.9 43 43.1 43.2 43.3 43.4 43.5 43.6 43.7 43.8 43.9 44 44.1
8 18 3 1 2 5 7 3 6 4 3 10 9
44.2 44.3 44.4 44.5 44.6 44.7 44.8 44.9 45 45.1 45.2 45.3 45.4
4 7 2 6 4 5 4 3 16 4 7 5 2
45.5 45.6 45.7 45.8 45.9 46 46.1 46.2 46.3 46.4 46.5 46.6 46.7
5 3 3 5 1 17 2 1 7 4 4 4 4
46.8 46.9 47 47.1 47.2 47.3 47.4 47.5 47.6 47.7 47.8 47.9 48
5 4 7 4 3 6 3 5 3 5 4 2 12
48.1 48.3 48.4 48.5 48.7 48.8 48.9 49 49.1 49.2 49.3 49.4 49.5
6 2 7 2 6 4 5 10 1 5 1 7 5
49.6 49.7 49.9 50 50.1 50.2 50.3 50.4 50.5 50.6 50.7 50.8 50.9
2 6 2 13 2 1 4 2 5 1 3 1 2
51 51.1 51.2 51.3 51.4 51.5 51.6 51.7 51.8 52 52.1 52.2 52.3
10 2 5 9 3 1 4 3 3 2 4 1 5
52.4 52.5 52.6 52.7 52.8 52.9 53 53.1 53.2 53.3 53.4 53.5 53.6
3 4 4 1 4 3 12 2 6 3 3 4 6
53.7 53.9 54 54.1 54.3 54.4 54.5 54.6 54.7 54.8 54.9 55 55.1
4 1 10 1 2 6 1 4 2 4 1 3 6
55.2 55.3 55.4 55.6 55.7 55.8 56 56.1 56.3 56.5 56.6 56.7 56.8
6 4 3 2 1 1 8 2 5 3 3 3 3
56.9 57 57.1 57.2 57.3 57.4 57.5 57.6 57.7 57.8 57.9 58 58.1
4 9 2 3 3 6 2 2 5 1 1 7 2
58.2 58.4 58.5 58.6 58.7 58.8 58.9 59 59.2 59.3 59.4 59.5 59.6
3 2 3 5 1 3 2 7 4 1 2 3 2
59.7 60 60.2 60.5 60.8 60.9 61 61.1 61.4 61.6 61.7 61.8 61.9
5 6 1 1 2 2 8 1 1 3 2 3 2
62 62.1 62.2 62.3 62.5 62.6 62.7 63 63.3 63.5 63.6 63.7 63.8
9 2 1 1 1 2 1 7 1 1 1 2 1
63.9 64 64.1 64.3 64.4 64.5 64.6 64.7 64.8 64.9 65 65.1 65.2
4 7 2 1 3 1 1 2 2 1 7 3 2
65.3 65.4 65.7 66 66.2 66.3 66.6 66.8 66.9 67 67.2 67.3 67.6
2 1 1 7 3 5 1 2 1 3 1 1 2
67.7 67.8 67.9 68 68.1 68.2 68.6 68.7 68.8 69 69.1 69.2 69.6
3 2 1 5 2 1 1 4 2 6 1 1 3
69.7 69.9 70 70.1 70.2 70.3 70.5 70.8 70.9 71 71.1 71.2 71.3
2 2 2 1 1 1 1 1 2 3 3 1 1
71.7 71.8 71.9 72 72.3 72.4 72.9 73 73.1 73.2 73.5 73.6 73.9
1 3 1 3 1 2 1 5 2 2 1 1 2
74 74.4 74.7 75 75.1 75.3 75.5 75.7 75.8 76 76.1 76.3 76.4
2 1 1 2 2 1 1 1 1 4 1 1 2
76.6 76.7 76.8 76.9 77 77.2 77.4 77.6 78 78.1 78.3 78.5 79.3
1 1 1 1 3 2 1 1 1 2 1 1 1
79.5 80 80.3 80.4 80.7 80.9 81 81.1 81.3 81.6 82 82.1 83
1 2 1 1 2 1 1 1 1 1 3 1 2
83.1 83.9 84 84.1 84.2 84.4 84.6 85 85.3 85.6 85.7 86 86.2
1 1 2 1 2 1 1 1 1 2 1 1 1
86.6 87 87.4 87.5 88 88.6 89.3 89.6 89.8 90.7 91 91.7 92.5
1 2 1 1 1 1 1 1 1 1 2 1 1
93.9 102.7 104.3
1 1 1
table(data2022$Daily.Mean.PM2.5.Concentration)
-6.7 -6.3 -5.1 -4.7 -4.1 -3.1 -3 -2.2 -2.1 -2 -1.9 -1.7 -1.5
1 1 1 2 1 1 1 2 1 1 2 1 1
-1.4 -1.3 -1.2 -1.1 -1 -0.9 -0.8 -0.7 -0.6 -0.5 -0.4 -0.3 -0.2
6 5 4 4 11 4 12 8 17 17 23 25 32
-0.1 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1 1.1
31 128 49 89 95 83 158 142 177 170 154 261 180
1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2 2.1 2.2 2.3 2.4
283 226 239 330 302 344 323 343 474 384 418 391 404
2.5 2.6 2.7 2.8 2.9 3 3.1 3.2 3.3 3.4 3.5 3.6 3.7
501 414 545 432 447 607 498 610 510 477 616 541 589
3.8 3.9 4 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5
510 493 721 506 628 526 503 640 542 664 500 505 690
5.1 5.2 5.3 5.4 5.5 5.6 5.7 5.8 5.9 6 6.1 6.2 6.3
501 644 480 498 627 508 614 510 444 658 457 577 474
6.4 6.5 6.6 6.7 6.8 6.9 7 7.1 7.2 7.3 7.4 7.5 7.6
479 572 465 567 469 406 554 424 522 409 415 511 423
7.7 7.8 7.9 8 8.1 8.2 8.3 8.4 8.5 8.6 8.7 8.8 8.9
512 358 375 487 366 476 394 338 462 373 458 371 359
9 9.1 9.2 9.3 9.4 9.5 9.6 9.7 9.8 9.9 10 10.1 10.2
401 343 403 372 345 394 325 404 295 294 383 283 346
10.3 10.4 10.5 10.6 10.7 10.8 10.9 11 11.1 11.2 11.3 11.4 11.5
292 288 324 276 300 241 265 312 261 301 235 215 288
11.6 11.7 11.8 11.9 12 12.1 12.2 12.3 12.4 12.5 12.6 12.7 12.8
207 249 223 201 248 196 243 206 184 203 170 209 158
12.9 13 13.1 13.2 13.3 13.4 13.5 13.6 13.7 13.8 13.9 14 14.1
178 222 159 233 175 165 180 135 176 114 144 168 132
14.2 14.3 14.4 14.5 14.6 14.7 14.8 14.9 15 15.1 15.2 15.3 15.4
142 145 125 123 119 136 137 123 146 104 111 109 99
15.5 15.6 15.7 15.8 15.9 16 16.1 16.2 16.3 16.4 16.5 16.6 16.7
132 105 129 91 91 120 84 76 82 81 106 80 89
16.8 16.9 17 17.1 17.2 17.3 17.4 17.5 17.6 17.7 17.8 17.9 18
80 76 96 63 104 70 73 71 67 74 58 42 85
18.1 18.2 18.3 18.4 18.5 18.6 18.7 18.8 18.9 19 19.1 19.2 19.3
42 65 44 44 56 54 67 41 55 53 36 51 31
19.4 19.5 19.6 19.7 19.8 19.9 20 20.1 20.2 20.3 20.4 20.5 20.6
40 47 39 34 35 30 52 27 44 41 31 53 37
20.7 20.8 20.9 21 21.1 21.2 21.3 21.4 21.5 21.6 21.7 21.8 21.9
39 27 33 44 32 36 36 28 23 33 42 28 23
22 22.1 22.2 22.3 22.4 22.5 22.6 22.7 22.8 22.9 23 23.1 23.2
30 38 40 18 25 31 24 31 28 19 24 18 29
23.3 23.4 23.5 23.6 23.7 23.8 23.9 24 24.1 24.2 24.3 24.4 24.5
21 24 21 20 22 11 19 22 22 16 16 15 17
24.6 24.7 24.8 24.9 25 25.1 25.2 25.3 25.4 25.5 25.6 25.7 25.8
19 19 16 15 13 17 26 22 20 21 16 17 12
25.9 26 26.1 26.2 26.3 26.4 26.5 26.6 26.7 26.8 26.9 27 27.1
16 10 13 14 22 18 16 18 20 15 13 19 15
27.2 27.3 27.4 27.5 27.6 27.7 27.8 27.9 28 28.1 28.2 28.3 28.4
12 10 16 19 15 12 11 9 15 15 12 12 9
28.5 28.6 28.7 28.8 28.9 29 29.1 29.2 29.3 29.4 29.5 29.6 29.7
18 9 15 12 12 11 9 19 12 11 9 12 8
29.8 29.9 30 30.1 30.2 30.3 30.4 30.5 30.6 30.7 30.8 30.9 31
9 7 5 15 10 5 8 7 11 8 11 9 19
31.1 31.2 31.3 31.4 31.5 31.6 31.7 31.8 31.9 32 32.1 32.2 32.3
6 13 8 4 9 10 9 9 11 11 4 4 8
32.4 32.5 32.6 32.7 32.8 32.9 33 33.1 33.2 33.3 33.4 33.5 33.6
3 7 5 10 7 8 6 13 7 6 5 17 5
33.7 33.8 33.9 34 34.1 34.2 34.3 34.4 34.5 34.6 34.7 34.8 34.9
6 12 4 6 7 4 3 6 6 9 5 5 6
35 35.1 35.2 35.3 35.4 35.5 35.6 35.7 35.8 35.9 36 36.1 36.2
8 4 8 3 2 11 3 3 2 10 8 4 9
36.3 36.4 36.5 36.6 36.7 36.8 37 37.1 37.2 37.3 37.4 37.5 37.6
4 5 6 2 3 5 10 5 2 5 3 5 5
37.7 37.8 37.9 38 38.1 38.2 38.3 38.4 38.5 38.6 38.7 38.8 38.9
3 4 3 5 5 5 1 4 5 3 3 6 1
39 39.1 39.2 39.3 39.4 39.5 39.6 39.7 39.8 39.9 40 40.1 40.2
4 5 4 4 3 8 3 6 5 1 3 4 2
40.3 40.4 40.5 40.6 40.7 40.8 40.9 41 41.1 41.2 41.3 41.4 41.5
2 1 4 4 7 8 2 3 1 5 1 2 3
41.6 41.7 41.8 41.9 42 42.1 42.2 42.3 42.4 42.5 42.6 42.7 42.8
3 2 4 2 3 3 3 2 2 1 2 4 1
42.9 43 43.1 43.2 43.3 43.4 43.5 43.6 43.7 43.8 43.9 44 44.1
1 2 5 4 2 3 5 1 4 2 1 5 2
44.2 44.3 44.4 44.5 44.8 45 45.2 45.4 45.5 45.7 45.9 46.1 46.2
2 1 1 3 2 1 2 1 1 2 1 3 4
46.3 46.6 46.7 46.8 46.9 47 47.1 47.2 47.4 47.5 47.8 47.9 48
3 1 4 1 1 1 2 1 1 1 1 1 2
48.2 48.3 48.5 48.6 48.7 48.9 49 49.1 49.2 49.4 49.7 49.8 50
1 1 1 1 3 1 3 1 1 1 1 1 2
50.2 50.5 50.8 51.2 51.4 51.5 51.8 51.9 52.2 52.5 52.6 52.8 52.9
1 2 1 2 1 1 1 1 1 1 2 1 3
53.2 53.3 53.5 53.6 53.8 53.9 54 54.5 54.6 54.7 54.9 55.1 55.6
1 1 1 1 1 2 1 1 2 3 1 1 1
55.8 56 56.3 57.8 58.1 58.6 59 59.3 60 61.5 61.7 62.3 62.4
1 1 1 1 1 2 1 1 1 2 1 1 1
62.5 62.7 62.8 62.9 63 63.7 64 64.2 64.4 66.2 66.6 66.7 68.6
1 1 1 1 1 1 2 1 1 2 1 1 1
69 69.1 70 70.7 71.8 73 73.5 73.8 73.9 74 75.3 75.5 76.3
1 1 1 1 1 3 2 1 1 1 2 1 1
77.2 77.5 78 81 83.5 83.6 84.4 84.5 85.2 87.3 88.6 88.8 89.2
1 1 1 1 1 1 1 1 1 1 1 1 1
89.8 90.7 91.4 92.4 96.6 97.2 98.2 101.4 102.3 103 105 106.4 107.2
1 1 1 1 1 1 1 2 1 1 1 1 1
108 108.8 109.3 109.5 110.2 111.1 111.6 113.6 118.7 119.2 122 133.3 133.8
1 1 1 1 1 1 1 1 1 1 1 1 1
139.2 140.3 141.1 150.9 152.4 155.2 168.7 177.1 178.6 181.7 212.8 218.2 243.9
1 1 1 1 1 1 1 1 1 1 1 1 1
244.7 246.2 296.3 302.5
1 1 1 1
summary(data2002$Daily.Mean.PM2.5.Concentration) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 7.00 12.00 16.12 20.50 104.30
summary(data2022$Daily.Mean.PM2.5.Concentration) Min. 1st Qu. Median Mean 3rd Qu. Max.
-6.700 4.100 6.800 8.414 10.700 302.500
- distribution of the key variable:
# Histograms and boxplots of the raw daily-mean PM2.5 values, one per year.
hist(data2002$Daily.Mean.PM2.5.Concentration)
hist(data2022$Daily.Mean.PM2.5.Concentration)
boxplot(data2002$Daily.Mean.PM2.5.Concentration)
boxplot(data2022$Daily.Mean.PM2.5.Concentration)
# Summary: dataset 2002 has 15976 rows and 22 columns; dataset 2022 has 59918
# rows and 22 columns (59703 rows once its 215 negative readings are dropped,
# and 23 columns once the Year column is added in section #2).
# The key variables are:
#   Date -> the sampling date (daily).
#   Daily.Mean.PM2.5.Concentration -> the daily average PM2.5 concentration
#     (µg/m³).
#   State / County / CBSA.Name -> geographic identifiers.
#   Site.ID (sometimes along with County.FIPS.Code) -> identifies the
#     monitoring site.
#   Site.Latitude / Site.Longitude -> location of the monitoring site (useful
#     for mapping).
#   POC (Parameter Occurrence Code) -> distinguishes multiple instruments at
#     the same site.
# Neither file has a sample-duration column; each record is already a daily
# mean.
# Among these, the daily average PM2.5 concentration (µg/m³) is the main
# variable we’re investigating in this assignment.
# The summary shows dataset 2002 has a mean of 16.12 µg/m³ and a maximum of
# 104.3 µg/m³. Distribution: right-skewed, with many moderate values and some
# extreme outliers.
# Dataset 2022 has a mean of 8.414 µg/m³ and a maximum of 302.5 µg/m³, but it
# also contains implausible negative values (minimum -6.7 µg/m³); these
# negative values are removed in a later section, after the two datasets are
# combined. Distribution: right-skewed, lower central tendency compared to
# 2002, but with occasional extreme peaks (likely wildfire smoke events).
# PM2.5 levels in 2022 are, on average, about half of 2002 levels, showing
# long-term improvement in California’s air quality. However, 2022 shows some
# very high outliers (>300 µg/m³), which probably correspond to episodic
# wildfire pollution events.
# Overall, data quality is good; there are no missing PM values. The key PM2.5
# variable shows clear right-skewness in both years. Long-term air quality has
# improved substantially, though extreme pollution events remain visible in
# 2022.
#2 combine and rename:
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(lubridate)
Attaching package: 'lubridate'
The following objects are masked from 'package:base':
date, intersect, setdiff, union
# Tag each year's records with its year, then stack both years into a single
# data frame so they can be compared side by side.
data2002 <- data2002 %>%
  mutate(Year = 2002)
data2022 <- data2022 %>%
  mutate(Year = 2022)
data_all <- bind_rows(data2002, data2022)
# dplyr is already attached above; this repeated call is harmless.
library(dplyr)
# Shorten the four column names used throughout the rest of the analysis.
# any_of() skips a name that is absent instead of raising an error, so a
# missing column is left alone just as with a names()<- replacement.
data_all <- rename(
  data_all,
  any_of(c(
    PM25 = "Daily.Mean.PM2.5.Concentration",
    Latitude = "Site.Latitude",
    Longitude = "Site.Longitude",
    Site = "Site.ID"
  ))
)
names(data_all) [1] "Date" "Source"
[3] "Site" "POC"
[5] "PM25" "Units"
[7] "Daily.AQI.Value" "Local.Site.Name"
[9] "Daily.Obs.Count" "Percent.Complete"
[11] "AQS.Parameter.Code" "AQS.Parameter.Description"
[13] "Method.Code" "Method.Description"
[15] "CBSA.Code" "CBSA.Name"
[17] "State.FIPS.Code" "State"
[19] "County.FIPS.Code" "County"
[21] "Latitude" "Longitude"
[23] "Year"
# Label every record by data-quality status. The NA check is the outer test,
# so a missing PM25 is reported as "Missing" regardless of the sign test.
data_all <- mutate(
  data_all,
  issue = if_else(
    is.na(PM25),
    "Missing",
    if_else(PM25 < 0, "Negative", "Valid")
  )
)

# Per-year record totals plus the count and share of each problem category.
issue_summary <- summarise(
  group_by(data_all, Year),
  total = n(),
  missing = sum(issue == "Missing"),
  negative = sum(issue == "Negative"),
  prop_missing = mean(issue == "Missing"),
  prop_negative = mean(issue == "Negative")
)
issue_summary# A tibble: 2 × 6
Year total missing negative prop_missing prop_negative
<dbl> <int> <int> <int> <dbl> <dbl>
1 2002 15976 0 0 0 0
2 2022 59918 0 215 0 0.00359
No missing values in either year. Negative values appear only in 2022 (215 of 59,918 records), and the proportion is very small (~0.36%). This suggests slightly more measurement or data-entry errors in the later year, but overall data quality is good. The maximum PM2.5 value (~300 µg/m³) is plausible and is kept.
library(dplyr)
library(ggplot2)
# Per-year share of each data-quality category.
# Reuses the `issue` labels already stored on data_all instead of re-deriving
# them with a two-way rule, which would relabel any missing PM25 as "Valid".
issue_plot_data <- data_all %>%
  count(Year, issue, name = "count") %>%
  group_by(Year) %>%
  mutate(prop = count / sum(count)) %>%
  ungroup()

# Stacked bar per year; the y axis is formatted as a percentage.
ggplot(issue_plot_data, aes(x = factor(Year), y = prop, fill = issue)) +
  geom_col(position = "stack") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
  labs(
    title = "Proportion of Valid and Negative PM2.5 Values by Year",
    x = "Year",
    y = "Proportion",
    fill = "Data Issue"
  ) +
  theme_minimal(base_size = 14)
#3:
library(leaflet)
# Colour palette keyed on the two study years.
pal <- colorFactor(
  palette = c("purple", "green"),
  domain = c(2002, 2022)
)

# Collapse the daily records to one row per site and year before mapping, so
# each monitor is drawn once per year rather than once per daily record.
site_locations <- data_all %>%
  distinct(Site, Year, Latitude, Longitude)

# Interactive map of monitor locations, coloured by year.
leaflet(site_locations) %>%
  addTiles() %>%
  addCircleMarkers(
    ~Longitude, ~Latitude,
    color = ~pal(Year),
    radius = 5,
    fillOpacity = 0.7,
    popup = ~paste("Year:", Year)
  ) %>%
  addLegend(
    "bottomright",
    pal = pal,
    values = ~Year,
    title = "Year",
    opacity = 1
  )
# 2002 sites (purple): clustered around major urban areas and some regional
# monitoring stations. 2022 sites (green): many overlap with 2002 locations,
# but some new sites appear in additional regions, indicating expanded
# coverage. Observation: The overall spatial distribution is similar, but
# there are more monitoring sites in 2022, especially in previously
# under-monitored areas.
#5:
level1: state level analysis
# Keep only records with a non-negative PM2.5 reading; filter() also drops
# rows where the comparison evaluates to NA.
data_all_clean <- filter(data_all, PM25 >= 0)

# Statewide descriptive statistics, one row per year.
state_summary <- summarise(
  group_by(data_all_clean, Year),
  mean_PM25 = mean(PM25, na.rm = TRUE),
  median_PM25 = median(PM25, na.rm = TRUE),
  sd_PM25 = sd(PM25, na.rm = TRUE),
  min_PM25 = min(PM25, na.rm = TRUE),
  max_PM25 = max(PM25, na.rm = TRUE),
  .groups = "drop"
)
print("State-level summary:")[1] "State-level summary:"
print(state_summary)# A tibble: 2 × 6
Year mean_PM25 median_PM25 sd_PM25 min_PM25 max_PM25
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2002 16.1 12 13.9 0 104.
2 2022 8.45 6.8 7.63 0 302.
# Overlaid histograms of the cleaned PM2.5 values, one colour per year.
ggplot(data_all_clean, aes(x = PM25, fill = factor(Year))) +
  geom_histogram(alpha = 0.6, position = "identity", bins = 50) +
  scale_fill_manual(values = c("2002" = "blue", "2022" = "red")) +
  labs(
    title = "Distribution of PM2.5 in California by Year",
    x = "PM2.5 (µg/m³)", fill = "Year"
  ) +
  theme_minimal()

# Side-by-side boxplots of the same values, one box per year.
ggplot(data_all_clean, aes(x = factor(Year), y = PM25, fill = factor(Year))) +
  geom_boxplot() +
  scale_fill_manual(values = c("2002" = "blue", "2022" = "red")) +
  labs(
    title = "PM2.5 Distribution in California by Year",
    x = "Year", y = "PM2.5 (µg/m³)"
  ) +
  theme_minimal()
# level2:
# Rebuild the cleaned data (non-negative PM2.5 only) for the county level.
data_all_clean <- filter(data_all, PM25 >= 0)

# Descriptive statistics for every county-year combination.
county_summary <- summarise(
  group_by(data_all_clean, County, Year),
  mean_PM25 = mean(PM25, na.rm = TRUE),
  median_PM25 = median(PM25, na.rm = TRUE),
  sd_PM25 = sd(PM25, na.rm = TRUE),
  .groups = "drop"
)
print("County-level summary:")[1] "County-level summary:"
print(county_summary)# A tibble: 98 × 5
County Year mean_PM25 median_PM25 sd_PM25
<chr> <dbl> <dbl> <dbl> <dbl>
1 Alameda 2002 14.3 10 11.4
2 Alameda 2022 8.21 7 4.95
3 Butte 2002 14.8 11.5 11.7
4 Butte 2022 6.26 4.5 5.78
5 Calaveras 2002 9.9 8 6.50
6 Calaveras 2022 6.04 5 4.10
7 Colusa 2002 11.7 9 10.0
8 Colusa 2022 7.61 6.7 4.76
9 Contra Costa 2002 15.1 9.5 14.5
10 Contra Costa 2022 8.24 7.2 4.93
# ℹ 88 more rows
# Boxplots of PM2.5 per county, split by year. Counties are ordered by their
# median PM2.5 and the axes are flipped so the county names read horizontally.
ggplot(
  data_all_clean,
  aes(x = reorder(County, PM25, FUN = median), y = PM25, fill = factor(Year))
) +
  geom_boxplot() +
  scale_fill_manual(values = c("2002" = "blue", "2022" = "red")) +
  coord_flip() +
  labs(
    title = "PM2.5 by County in California",
    x = "County", y = "PM2.5 (µg/m³)", fill = "Year"
  ) +
  theme_minimal()
# level3 site level:
# Site-level records for Los Angeles County.
# Start from data_all_clean (non-negative PM2.5 only) so this level uses the
# same records as the state- and county-level summaries; data_all still
# contains the negative readings.
la_sites <- data_all_clean %>%
  filter(County == "Los Angeles")

# Descriptive statistics for every site-year combination in the county.
la_summary <- la_sites %>%
  group_by(Site, Year) %>%
  summarise(
    mean_PM25 = mean(PM25, na.rm = TRUE),
    median_PM25 = median(PM25, na.rm = TRUE),
    sd_PM25 = sd(PM25, na.rm = TRUE),
    .groups = "drop"
  )
la_summary# A tibble: 25 × 5
Site Year mean_PM25 median_PM25 sd_PM25
<int> <dbl> <dbl> <dbl> <dbl>
1 60370002 2002 20.8 18.7 12.1
2 60370002 2022 9.72 9.65 4.39
3 60370016 2022 8.42 7.8 5.47
4 60371002 2002 24.0 21.6 12.7
5 60371103 2002 22.0 19.3 11.7
6 60371103 2022 11.6 10.9 4.57
7 60371201 2002 18.9 17.0 10.7
8 60371201 2022 10.7 10.3 4.56
9 60371301 2002 23.3 19.8 12.0
10 60371302 2022 13.0 11.9 6.22
# ℹ 15 more rows
# Boxplots of PM2.5 per Los Angeles County monitoring site, split by year.
# Sites are ordered by their median PM2.5 and the axes are flipped so the
# site IDs read horizontally.
ggplot(
  la_sites,
  aes(x = reorder(Site, PM25, FUN = median), y = PM25, fill = factor(Year))
) +
  geom_boxplot() +
  scale_fill_manual(values = c("2002" = "blue", "2022" = "red")) +
  coord_flip() +
  labs(
    title = "PM2.5 by Monitoring Site in Los Angeles County",
    x = "Site", y = "PM2.5 (µg/m³)", fill = "Year"
  ) +
  theme_minimal()
# First plot: Statewide PM2.5 Distribution. X-axis: Year (2002 vs. 2022).
# Y-axis: PM2.5 concentration (µg/m³). Observations: Median PM2.5 has
# decreased from 2002 to 2022. The interquartile range (IQR) is narrower in
# 2022, suggesting less variability in PM2.5 across the state. Both years show
# some extreme outliers, but 2022 has more very high outliers (up to
# ~300 µg/m³), indicating occasional severe pollution events.
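Second plot: The county-level boxplots show that PM2.5 is generally higher in urbanized counties, with Los Angeles and Riverside consistently above other counties. In most counties, 2022 PM2.5 values are lower than in 2002 — every county listed in the county-level summary has a lower mean and median in 2022 (e.g., Alameda's mean falls from 14.3 to 8.21 µg/m³) — and the variability is larger in more densely populated areas. The site-level boxplots in Los Angeles County reveal that some sites experience higher PM2.5 than others, reflecting local pollution sources. Temporal changes between 2002 and 2022 vary in size by site: the sites listed in the site-level summary that report in both years (60370002, 60371103, 60371201) all show lower means in 2022, while several other sites report in only one of the two years, so air quality trends within the county should be compared site by site.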